if (!require("BiocManager"))
install.packages("BiocManager")
## Loading required package: BiocManager
## Warning: package 'BiocManager' was built under R version 4.4.3
## Bioconductor version '3.20' is out-of-date; the current release version '3.21'
## is available with R version '4.5'; see https://bioconductor.org/install
BiocManager::install("maftools")
## Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.1 (2024-06-14 ucrt)
## Warning: package(s) not installed when version(s) same as or greater than current; use
## `force = TRUE` to re-install: 'maftools'
## Installation paths not writeable, unable to update packages
## path: C:/Program Files/R/R-4.4.1/library
## packages:
## boot, class, cluster, foreign, KernSmooth, lattice, MASS, Matrix, mgcv,
## nlme, nnet, rpart, spatial, survival
## Old packages: 'BiocManager', 'BiocParallel', 'cli', 'data.table', 'Deriv',
## 'evaluate', 'ggforce', 'ggnewscale', 'pheatmap', 'rlang', 'RPostgreSQL',
## 'RSQLite', 'tibble', 'xfun'
library(maftools)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
anno_df <- annovarToMaf(
annovar = "combined_trunc_missense_families.hg19_multianno.txt",
Center = "CSI-NUS",
refBuild = "hg19",
tsbCol = "Samples",
table = "refGene",
sep = "\t"
)
## -Reading annovar files
## --Extracting tx, exon, txchange and aa-change
## -Adding Variant_Type
## Finished in 1.590s elapsed (1.410s cpu)
families_list <- list(
"18" = c("HPC18", "HPC109", "HPC505"),
"25" = c("HPC8", "HPC25"),
"29" = c("HPC29", "HPC77", "HPC84"),
"32" = c("HPC31", "HPC32"),
"33" = c("HPC33", "HPC39", "HPC56"),
"52" = c("HPC52", "HPC417"),
"57" = c("HPC57", "HPC79", "HPC80"),
"62" = c("HPC21", "HPC62"),
"67" = c("HPC67", "HPC110"),
"102" = c("HPC102", "HPC107"),
"112" = c("HPC112", "HPC124"),
"120" = c("HPC120", "HPC397", "HPC488"),
"123" = c("HPC123", "HPC206", "HPC495"),
"136" = c("HPC136", "HPC502"),
"164" = c("HPC164", "HPC486"),
"172" = c("HPC172", "HPC489"),
"176" = c("HPC176", "HPC212"),
"181" = c("HPC181", "HPC525"),
"192" = c("HPC192", "HPC209"),
"199" = c("HPC199", "HPC484"),
"201" = c("HPC201", "HPC520"),
"204" = c("HPC204", "HPC503"),
"213" = c("HPC213", "HPC509"),
"214" = c("HPC214", "HPC261"),
"220" = c("HPC220", "HPC528"),
"229" = c("HPC229", "HPC401"),
"232" = c("HPC232", "HPC529"),
"234" = c("HPC234", "HPC518"),
"241" = c("HPC241", "HPC491"),
"258" = c("HPC128", "HPC258"),
"259" = c("HPC259", "HPC521"),
"264" = c("HPC210", "HPC264"),
"267" = c("HPC267", "HPC514"),
"282" = c("HPC282", "HPC511"),
"304" = c("HPC304", "HPC459"),
"325" = c("HPC193", "HPC325"),
"328" = c("HPC328", "HPC513"),
"329" = c("HPC329", "HPC506"),
"331" = c("HPC331", "HPC482"),
"387" = c("HPC387", "HPC516"),
"420" = c("HPC420", "HPC507"),
"460" = c("HPC460", "HPC522"),
"470" = c("HPC470", "HPC512"),
"510" = c("HPC114", "HPC510"),
"524" = c("HPC524", "HPC527")
)
filtered_anno_df <- anno_df %>%
as_tibble() %>%
mutate(
AF_nfe_num = suppressWarnings(as.numeric(AF_nfe)),
ExAC_nontcga_NFE_num = suppressWarnings(as.numeric(ExAC_nontcga_NFE)),
AF_popmax_num = suppressWarnings(as.numeric(AF_popmax))
) %>%
filter(
AF_nfe == "." | AF_nfe_num <= 0.01,
Family != ".",
Variant_Classification != "Unknown",
Variant_Classification != "Translation_Start_Site",
!CLNSIG %in% c("Benign", "Likely_benign", "Benign/Likely_benign"),
ExAC_nontcga_NFE == "." | ExAC_nontcga_NFE_num <= 0.01,
AF_popmax == "." | AF_popmax_num < 0.01
) %>%
dplyr::select(-AF_nfe_num, -ExAC_nontcga_NFE_num, -AF_popmax_num)
nrow(filtered_anno_df)
## [1] 552
#separar por Sample:
expanded_s_anno_df <- filtered_anno_df %>%
separate_rows(Tumor_Sample_Barcode, sep = ",") %>%
mutate(Tumor_Sample_Barcode = trimws(Tumor_Sample_Barcode))
nrow(expanded_s_anno_df)
## [1] 2139
library(data.table)
## Warning: package 'data.table' was built under R version 4.4.3
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(maftools)
maf_object <- read.maf(
maf = expanded_s_anno_df,
isTCGA = FALSE
)
## -Validating
## -Summarizing
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.140s elapsed (0.070s cpu)
# cria sample2fam
sample2fam <- do.call(rbind, lapply(names(families_list), function(fam) {
data.frame(
Tumor_Sample_Barcode = families_list[[fam]],
Family = fam,
stringsAsFactors = FALSE
)
}))
sample2fam <- unique(sample2fam) # remove duplicados
# extrai clinicalData e faz merge
clin <- getClinicalData(maf_object)
clin2 <- merge(
clin,
sample2fam,
by = "Tumor_Sample_Barcode",
all.x = TRUE, # mantém todas amostras do MAF
sort = FALSE
)
# atribui de volta ao slot clínico
maf_object@clinical.data <- clin2
oncoplot(
maf = maf_object,
clinicalFeatures = "Family",
sortByAnnotation = TRUE,
showTumorSampleBarcodes = FALSE,
top = 15
)
plotmafSummary(maf = maf_object, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
mafbarplot(maf_object)
getSampleSummary(maf_object)
## Tumor_Sample_Barcode Frame_Shift_Del Frame_Shift_Ins In_Frame_Del
## <fctr> <int> <int> <int>
## 1: HPC124 7 3 5
## 2: HPC112 7 2 5
## 3: HPC79 2 5 10
## 4: HPC109 4 4 5
## 5: HPC495 5 1 7
## 6: HPC120 5 4 4
## 7: HPC201 4 1 3
## 8: HPC397 6 2 6
## 9: HPC164 4 1 6
## 10: HPC520 2 1 2
## 11: HPC84 3 3 2
## 12: HPC486 2 0 5
## 13: HPC102 5 5 5
## 14: HPC206 3 1 4
## 15: HPC80 0 3 8
## 16: HPC420 0 4 4
## 17: HPC488 2 5 4
## 18: HPC507 0 4 4
## 19: HPC18 5 3 3
## 20: HPC39 3 3 4
## 21: HPC128 5 1 3
## 22: HPC232 4 3 3
## 23: HPC25 5 2 4
## 24: HPC264 2 2 3
## 25: HPC77 2 3 3
## 26: HPC229 3 2 5
## 27: HPC241 2 3 7
## 28: HPC258 4 1 5
## 29: HPC491 2 2 7
## 30: HPC505 5 1 1
## 31: HPC107 3 5 3
## 32: HPC123 1 0 8
## 33: HPC210 2 1 2
## 34: HPC234 5 2 4
## 35: HPC259 2 1 1
## 36: HPC482 1 1 3
## 37: HPC521 4 1 2
## 38: HPC529 4 4 4
## 39: HPC114 5 1 2
## 40: HPC204 5 1 2
## 41: HPC261 3 0 3
## 42: HPC331 2 1 2
## 43: HPC401 1 2 6
## 44: HPC509 6 5 4
## 45: HPC528 4 3 2
## 46: HPC29 1 3 4
## 47: HPC503 5 1 2
## 48: HPC510 6 2 1
## 49: HPC192 2 1 4
## 50: HPC209 3 0 5
## 51: HPC32 2 1 3
## 52: HPC459 2 2 4
## 53: HPC522 2 2 2
## 54: HPC212 4 3 2
## 55: HPC220 3 2 4
## 56: HPC329 5 2 1
## 57: HPC33 2 1 3
## 58: HPC56 2 1 3
## 59: HPC57 1 4 4
## 60: HPC214 1 1 3
## 61: HPC304 1 2 2
## 62: HPC8 3 2 4
## 63: HPC176 5 1 1
## 64: HPC21 2 1 4
## 65: HPC31 1 2 2
## 66: HPC460 2 3 4
## 67: HPC506 2 2 1
## 68: HPC518 4 1 4
## 69: HPC193 0 4 4
## 70: HPC213 4 4 1
## 71: HPC387 4 1 3
## 72: HPC524 4 2 1
## 73: HPC62 1 5 2
## 74: HPC267 2 3 4
## 75: HPC325 0 2 4
## 76: HPC514 2 1 2
## 77: HPC527 3 0 2
## 78: HPC172 1 0 2
## 79: HPC417 2 2 1
## 80: HPC489 1 0 0
## 81: HPC52 3 2 0
## 82: HPC110 2 1 3
## 83: HPC181 1 3 3
## 84: HPC470 0 2 3
## 85: HPC512 1 2 1
## 86: HPC282 1 1 3
## 87: HPC511 1 1 2
## 88: HPC67 2 0 1
## 89: HPC136 0 1 2
## 90: HPC516 3 1 2
## 91: HPC502 0 0 3
## 92: HPC484 0 2 1
## 93: HPC513 1 1 2
## 94: HPC525 1 0 1
## 95: HPC328 0 1 1
## 96: HPC199 1 0 1
## Tumor_Sample_Barcode Frame_Shift_Del Frame_Shift_Ins In_Frame_Del
## In_Frame_Ins Missense_Mutation Nonsense_Mutation Nonstop_Mutation total
## <int> <int> <int> <int> <num>
## 1: 4 12 7 0 38
## 2: 4 12 6 0 36
## 3: 4 8 7 0 36
## 4: 2 14 5 0 34
## 5: 2 15 4 0 34
## 6: 6 10 3 1 33
## 7: 6 10 8 1 33
## 8: 4 10 4 1 33
## 9: 5 7 9 0 32
## 10: 8 8 10 1 32
## 11: 5 8 11 0 32
## 12: 5 11 8 0 31
## 13: 5 6 4 0 30
## 14: 2 16 4 0 30
## 15: 5 7 7 0 30
## 16: 3 11 7 0 29
## 17: 5 8 4 1 29
## 18: 4 9 8 0 29
## 19: 2 11 4 0 28
## 20: 5 8 5 0 28
## 21: 3 7 8 0 27
## 22: 4 5 8 0 27
## 23: 6 5 5 0 27
## 24: 4 8 8 0 27
## 25: 3 8 8 0 27
## 26: 5 5 6 0 26
## 27: 2 5 7 0 26
## 28: 2 7 7 0 26
## 29: 2 6 7 0 26
## 30: 2 9 8 0 26
## 31: 4 6 4 0 25
## 32: 2 9 5 0 25
## 33: 3 9 8 0 25
## 34: 4 3 7 0 25
## 35: 8 5 8 0 25
## 36: 3 9 8 0 25
## 37: 7 6 5 0 25
## 38: 2 5 6 0 25
## 39: 5 7 4 0 24
## 40: 3 10 3 0 24
## 41: 9 4 4 1 24
## 42: 2 10 7 0 24
## 43: 3 7 5 0 24
## 44: 3 3 3 0 24
## 45: 5 5 5 0 24
## 46: 1 6 8 0 23
## 47: 2 9 4 0 23
## 48: 4 5 5 0 23
## 49: 1 6 8 0 22
## 50: 2 6 6 0 22
## 51: 2 9 5 0 22
## 52: 2 8 4 0 22
## 53: 2 9 5 0 22
## 54: 3 5 4 0 21
## 55: 2 5 5 0 21
## 56: 2 5 6 0 21
## 57: 5 7 3 0 21
## 58: 4 6 5 0 21
## 59: 5 4 3 0 21
## 60: 5 4 5 1 20
## 61: 3 7 5 0 20
## 62: 2 5 4 0 20
## 63: 2 8 2 0 19
## 64: 2 5 5 0 19
## 65: 4 7 3 0 19
## 66: 1 6 3 0 19
## 67: 1 6 7 0 19
## 68: 3 2 5 0 19
## 69: 3 4 3 0 18
## 70: 3 3 3 0 18
## 71: 5 1 4 0 18
## 72: 5 3 3 0 18
## 73: 2 4 4 0 18
## 74: 0 2 6 0 17
## 75: 4 5 2 0 17
## 76: 4 3 5 0 17
## 77: 6 5 1 0 17
## 78: 2 5 6 0 16
## 79: 2 3 6 0 16
## 80: 5 5 5 0 16
## 81: 2 3 6 0 16
## 82: 4 2 3 0 15
## 83: 4 2 2 0 15
## 84: 2 2 6 0 15
## 85: 3 2 6 0 15
## 86: 2 2 3 0 12
## 87: 0 3 5 0 12
## 88: 4 3 2 0 12
## 89: 3 1 4 0 11
## 90: 3 0 2 0 11
## 91: 3 2 2 0 10
## 92: 4 0 2 0 9
## 93: 2 3 0 0 9
## 94: 3 1 3 0 9
## 95: 4 0 1 0 7
## 96: 3 1 0 0 6
## In_Frame_Ins Missense_Mutation Nonsense_Mutation Nonstop_Mutation total
maf.titv = titv(maf = maf_object, plot = FALSE, useSyn = TRUE)
#plot titv summary
plotTiTv(res = maf.titv)
tbl <- table(expanded_s_anno_df$Tumor_Sample_Barcode)
# 2) Ordena desc e vê as top 10
top10 <- sort(tbl, decreasing=TRUE)[1:10]
print(top10)
##
## HPC124 HPC112 HPC79 HPC109 HPC495 HPC120 HPC201 HPC397 HPC164 HPC520
## 38 36 36 34 34 33 33 33 32 32
gene_summary <- getGeneSummary(maf_object)
top_genes <- gene_summary$Hugo_Symbol[1:10]
print(top_genes)
## [1] "PABPC3" "ASPN" "ATN1" "NOTCH4" "HLA-DRB5" "MAGI1"
## [7] "NCF1" "EPHB6" "CHST15" "PTP4A1"
lollipopPlot(maf = maf_object,
gene = 'PABPC3',
AACol='aaChange',
showMutationRate = TRUE)
lollipopPlot(maf = maf_object,
gene = 'ASPN',
AACol='aaChange',
showMutationRate = TRUE)
## 2 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
## HGNC refseq.ID protein.ID aa.length
## <char> <char> <char> <num>
## 1: ASPN NM_001193335 NP_001180264 242
## 2: ASPN NM_017680 NP_060150 379
## Using longer transcript NM_017680 for now.
# Top 10 amostras por número de variantes
top10 <- sort(tbl, decreasing = TRUE)[1:10]
# monta o data.frame
top10_df <- data.frame(
Sample = names(top10),
Count = as.integer(top10),
stringsAsFactors = FALSE
)
# Reordena os níveis de Sample pelo Count
top10_df$Sample <- factor(
top10_df$Sample,
levels = top10_df$Sample[order(top10_df$Count)]
)
# gráfico com ggplot2
ggplot(top10_df, aes(x = Sample, y = Count, fill = Sample)) +
geom_col(width = 0.7, show.legend = FALSE) +
geom_text(
aes(label = Count),
vjust = -0.5,
size = 2.5,
color = "black"
) +
scale_fill_brewer(palette = "Set3") +
labs(
title = "Top 10 Amostras Mais Mutadas",
x = "Amostra",
y = "Número de Mutações"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0.5)
)
rainfallPlot(
maf = subsetMaf(maf = maf_object, tsb = "HPC124"),
detectChangePoints = TRUE,
pointSize = 0.8
)
## -Processing clinical data
## Processing HPC124..
## No changepoints detected!
maf.sig = oncodrive(maf = maf_object, AACol='aaChange', minMut = 5, pvalMethod = 'zscore')
## Warning in oncodrive(maf = maf_object, AACol = "aaChange", minMut = 5,
## pvalMethod = "zscore"): Oncodrive has been superseeded by OncodriveCLUSTL. See
## http://bg.upf.edu/group/projects/oncodrive-clust.php
## No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)
## Estimating cluster scores from non-syn variants..
## | | | 0% | |== | 2% | |=== | 5% | |===== | 7% | |======= | 9% | |======== | 12% | |========== | 14% | |=========== | 16% | |============= | 19% | |=============== | 21% | |================ | 23% | |================== | 26% | |==================== | 28% | |===================== | 30% | |======================= | 33% | |======================== | 35% | |========================== | 37% | |============================ | 40% | |============================= | 42% | |=============================== | 44% | |================================= | 47% | |================================== | 49% | |==================================== | 51% | |===================================== | 53% | |======================================= | 56% | |========================================= | 58% | |========================================== | 60% | |============================================ | 63% | |============================================== | 65% | |=============================================== | 67% | |================================================= | 70% | |================================================== | 72% | |==================================================== | 74% | |====================================================== | 77% | |======================================================= | 79% | |========================================================= | 81% | |=========================================================== | 84% | |============================================================ | 86% | |============================================================== | 88% | |=============================================================== | 91% | |================================================================= | 93% | |=================================================================== | 95% | |==================================================================== | 98% | |======================================================================| 100%
## Comapring with background model and estimating p-values..
## Done !
plotOncodrive(res = maf.sig, fdrCutOff = 0.1, useFraction = TRUE, labelSize = 0.5)
Expandir por familia em vez de samples:
#por familia:
expanded_f_anno_df <- filtered_anno_df %>%
separate_rows(Family, sep = ",") %>%
mutate(Family = trimws(Family))
nrow(expanded_f_anno_df)
## [1] 773
# Contar quantas variantes cada família tem em anno_df_final
family_counts <- expanded_f_anno_df %>%
group_by(Family) %>%
summarise(n_variants = n(), .groups="drop") %>%
arrange(desc(n_variants))
ggplot(family_counts, aes(x = reorder(Family, n_variants), y = n_variants, fill = n_variants)) +
geom_col(width = 0.5, show.legend = FALSE) +
geom_text(aes(label = n_variants), hjust = -0.8, size = 2.5) +
scale_fill_viridis_c(option = "C", direction = -1) +
labs(
title = "Familias por numero de Mutacoes",
x = "Familia",
y = "Numero de Variantes"
) +
coord_flip() +
theme_minimal(base_size = 12) +
theme(
axis.text.y = element_text(size = 7),
plot.title = element_text(face = "bold", hjust = 0.5)
)
#Para cada (Hugo_Symbol, Family), mantem apenas uma linha (variants diferentes no mesmo gene-fam contam uma vez)
gene_family_distinct <- expanded_f_anno_df %>%
distinct(Hugo_Symbol, Family)
# Conta quantas famílias diferentes cada gene tem
gene_family_counts <- gene_family_distinct %>%
group_by(Hugo_Symbol) %>%
summarise(
n_families = n(), # número de famílias em que esse gene está mutado
.groups = "drop"
) %>%
arrange(desc(n_families))
# Veja as top 10
top10_genes_families <- gene_family_counts %>% slice_head(n = 11)
# filtrar o unknown
top10_genes_families <- top10_genes_families %>% filter(Hugo_Symbol!='Unknown')
# plote um barplot das top 10
ggplot(top10_genes_families, aes(x = reorder(Hugo_Symbol, n_families), y = n_families, fill = n_families)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = n_families), vjust = -0.3, size = 3.5) +
scale_fill_viridis_c(option = "C", direction = -1) +
labs(
title = "Top 10 Genes Most Frequently Mutated in Families",
x = "Gene",
y = "Number of Families"
) +
theme_minimal(base_size = 12) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", hjust = 0.5)
)
top30_genes <- gene_family_counts %>%
slice_head(n = 31) %>%
pull(Hugo_Symbol)
# Aqui, usamos cleaned_anno_df_family (que já explode cada variante por família)
# e contamos quantas mutações (linhas) cada (gene, família) tem, mas como variantes diferentes no
# mesmo (gene, família) queremos contar todas
gene_family_counts_detailed <- expanded_f_anno_df %>%
filter(Hugo_Symbol %in% top30_genes) %>% # manter só os Top 30 genes
group_by(Hugo_Symbol, Family) %>%
summarise(mutation_count = n(), .groups = "drop")
gene_family_counts_detailed<- gene_family_counts_detailed %>% filter(Hugo_Symbol!='Unknown')
# Transformar em matriz wide-format: colunas = famílias, linhas = genes, valores = mutation_count
gene_family_matrix <- gene_family_counts_detailed %>%
pivot_wider(names_from = Family,
values_from = mutation_count,
values_fill = 0)
# Ajustar a estrutura para pheatmap
gene_family_mat <- as.data.frame(gene_family_matrix)
rownames(gene_family_mat) <- gene_family_mat$Hugo_Symbol
gene_family_mat$Hugo_Symbol <- NULL
gene_family_mat <- as.matrix(gene_family_mat)
# plotar o heatmap dos Top 30 genes vs. famílias
pheatmap(
gene_family_mat,
cluster_rows = TRUE,
cluster_cols = TRUE,
color = colorRampPalette(c("white", "orange", "red"))(100),
main = "Heatmap: Numero de Mutacoes (Top 30 Genes) por Familia",
fontsize_row = 6,
fontsize_col = 7
)
pws = pathways(maf = maf_object, plotType = 'treemap')
## Summarizing signalling pathways [Sanchez-Vega et al., https://doi.org/10.1016/j.cell.2018.03.035]
plotPathways(maf = maf_object, pathlist = pws)
Graficos maftools por familia:
expanded_f_anno_df_2 <- filtered_anno_df %>%
separate_rows(Family, sep = ",") %>%
mutate(
Family = trimws(Family),
Tumor_Sample_Barcode = Family # usa a família como identificador principal
)
maf_fam_object <- read.maf(
maf = expanded_f_anno_df_2,
isTCGA = FALSE
)
## -Validating
## -Summarizing
## -Processing clinical data
## --Missing clinical data
## -Finished in 0.070s elapsed (0.050s cpu)
fam_metadata <- data.frame(
Tumor_Sample_Barcode = unique(expanded_f_anno_df$Family),
grupo = "grupoX", # ou outras variáveis por família
stringsAsFactors = FALSE
)
maf_fam_object@clinical.data <- merge(
getClinicalData(maf_fam_object),
fam_metadata,
by = "Tumor_Sample_Barcode",
all.x = TRUE,
sort = FALSE
)
oncoplot(
maf = maf_fam_object,
top = 20, # top 20 genes mais mutados
sortByAnnotation = TRUE, # ordena pelas colunas clínicas, se houver
showTumorSampleBarcodes = TRUE, # mostra nomes das "amostras" (aqui: famílias)
draw_titv = FALSE, # tira gráfico de Ti/Tv no topo
fontSize = 0.8,
removeNonMutated = TRUE
)
plotmafSummary(maf = maf_fam_object, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)
mafbarplot(maf_fam_object)
maf.titv = titv(maf = maf_fam_object, plot = FALSE, useSyn = TRUE)
#plot titv summary
plotTiTv(res = maf.titv)
lollipopPlot(maf = maf_fam_object,
gene = 'PABPC3',
AACol='aaChange',
showMutationRate = TRUE)
lollipopPlot(maf = maf_fam_object,
gene = 'ASPN',
AACol='aaChange',
showMutationRate = TRUE)
## 2 transcripts available. Use arguments refSeqID or proteinID to manually specify tx name.
## HGNC refseq.ID protein.ID aa.length
## <char> <char> <char> <num>
## 1: ASPN NM_001193335 NP_001180264 242
## 2: ASPN NM_017680 NP_060150 379
## Using longer transcript NM_017680 for now.
library(clusterProfiler)
## Warning: package 'clusterProfiler' was built under R version 4.4.2
##
## clusterProfiler v4.14.6 Learn more at https://yulab-smu.top/contribution-knowledge-mining/
##
## Please cite:
##
## S Xu, E Hu, Y Cai, Z Xie, X Luo, L Zhan, W Tang, Q Wang, B Liu, R Wang,
## W Xie, T Wu, L Xie, G Yu. Using clusterProfiler to characterize
## multiomics data. Nature Protocols. 2024, 19(11):3292-3320
##
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:stats':
##
## filter
library(org.Hs.eg.db)
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.4.2
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:clusterProfiler':
##
## rename
## The following objects are masked from 'package:data.table':
##
## first, second
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:utils':
##
## findMatches
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:clusterProfiler':
##
## slice
## The following object is masked from 'package:data.table':
##
## shift
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:grDevices':
##
## windows
##
## Attaching package: 'AnnotationDbi'
## The following object is masked from 'package:clusterProfiler':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
##
# Lista original
gene_list <- gene_family_counts$Hugo_Symbol[1:20]
# Mapeamento de símbolos para ENTREZID
entrez_ids <- bitr(
gene_list,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db
)
## 'select()' returned 1:1 mapping between keys and columns
entrez_ids_total <- bitr(
gene_family_counts$Hugo_Symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db
)
## 'select()' returned 1:1 mapping between keys and columns
## Warning in bitr(gene_family_counts$Hugo_Symbol, fromType = "SYMBOL", toType =
## "ENTREZID", : 1.19% of input gene IDs are fail to map...
GO (Gene Ontology) – enrichGO()
ego <- enrichGO(
gene = entrez_ids$ENTREZID,
OrgDb = org.Hs.eg.db,
ont = "BP",
pvalueCutoff = 0.05,
qvalueCutoff = 0.2,
readable = TRUE
)
dotplot(ego, showCategory = 20)
KEGG Pathways – enrichKEGG()
ekegg <- enrichKEGG(
gene = entrez_ids$ENTREZID,
organism = "hsa",
pvalueCutoff = 0.05
)
## Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...
## Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...
dotplot(ekegg, showCategory = 20)
pws = pathways(maf = maf_fam_object, plotType = 'treemap')
## Summarizing signalling pathways [Sanchez-Vega et al., https://doi.org/10.1016/j.cell.2018.03.035]
plotPathways(maf = maf_fam_object, pathlist = pws)
library(maftools)
oncodrive_res <- oncodrive(
maf = maf_fam_object,
AACol = "aaChange",
minMut = 5,
pvalMethod = "zscore"
)
## Warning in oncodrive(maf = maf_fam_object, AACol = "aaChange", minMut = 5, :
## Oncodrive has been superseeded by OncodriveCLUSTL. See
## http://bg.upf.edu/group/projects/oncodrive-clust.php
## No syn mutations found! Skipping background estimation. Using predefined values. (Mean = 0.279; SD = 0.13)
## Estimating cluster scores from non-syn variants..
## | | | 0% | |====== | 9% | |============= | 18% | |=================== | 27% | |========================= | 36% | |================================ | 45% | |====================================== | 55% | |============================================= | 64% | |=================================================== | 73% | |========================================================= | 82% | |================================================================ | 91% | |======================================================================| 100%
## Comapring with background model and estimating p-values..
## Done !
head(oncodrive_res$res)
## NULL
plotOncodrive(oncodrive_res, fdrCutOff = 0.05)
oncodrive_input <- expanded_f_anno_df_2 %>%
transmute(
CHROMOSOME = Chromosome,
POSITION = Start_Position,
REF = Reference_Allele,
ALT = Tumor_Seq_Allele2,
SAMPLE = Tumor_Sample_Barcode
)
write.table(
oncodrive_input,
file = "oncodrive_input.txt",
sep = "\t",
quote = FALSE,
row.names = FALSE
)
# Carrega pacotes necessários
library(dplyr)
library(ggplot2)
# Lê o output do OncodriveFML
res <- read.delim("input-oncodrivefml.tsv", stringsAsFactors = FALSE)
# Visualiza as primeiras linhas
head(res)
## GENE_ID MUTS MUTS_RECURRENCE SAMPLES P_VALUE Q_VALUE P_VALUE_NEG
## 1 ENSG00000151846 56 2 36 1.00e-07 0.00000460 1.00000
## 2 ENSG00000131951 2 1 2 1.20e-04 0.00276000 0.99998
## 3 ENSG00000148288 2 1 2 8.00e-04 0.01226667 0.99967
## 4 ENSG00000185069 2 1 2 1.27e-03 0.01460500 0.99895
## 5 ENSG00000168594 2 1 2 2.44e-03 0.02244800 0.99831
## 6 ENSG00000115828 2 1 2 3.98e-03 0.03051333 0.99623
## Q_VALUE_NEG SNP MNP INDELS SYMBOL
## 1 1 56 0 0 PABPC3
## 2 1 2 0 0 LRRC9
## 3 1 2 0 0 GBGT1
## 4 1 2 0 0 KRT76
## 5 1 0 0 2 ADAM29
## 6 1 2 0 0 QPCT
# Selecionar genes com Q_VALUE < 0.1
sig_genes <- res %>%
filter(Q_VALUE < 0.1) %>%
arrange(Q_VALUE)
# Verifica o top 10
head(sig_genes[, c("SYMBOL", "MUTS", "SAMPLES", "P_VALUE", "Q_VALUE")], 10)
## SYMBOL MUTS SAMPLES P_VALUE Q_VALUE
## 1 PABPC3 56 36 0.0000001 0.00000460
## 2 LRRC9 2 2 0.0001200 0.00276000
## 3 GBGT1 2 2 0.0008000 0.01226667
## 4 KRT76 2 2 0.0012700 0.01460500
## 5 ADAM29 2 2 0.0024400 0.02244800
## 6 QPCT 2 2 0.0039800 0.03051333
## 7 KCNJ12 2 2 0.0052300 0.03436857
## 8 DCDC1 2 2 0.0069300 0.03984750
## 9 ASCL3 2 2 0.0136000 0.06951111
## 10 ABCG5 2 2 0.0201500 0.09269000
# Exporta a tabela filtrada para usar em relatórios
write.table(
sig_genes,
file = "OncodriveFML_sig_genes_q0.1.tsv",
sep = "\t",
quote = FALSE,
row.names = FALSE
)
# Adiciona coluna de –log10(p‑value)
sig_genes <- sig_genes %>%
mutate(log10P = -log10(P_VALUE))
# Gráfico de barras horizontais: genes por –log10(p‑value)
ggplot(sig_genes, aes(
x = reorder(SYMBOL, log10P),
y = log10P,
fill = log10P
)) +
geom_col() +
coord_flip() +
labs(
x = "Gene",
y = "-log10(p_value)",
title = "Genes com FM_bias significativo (q < 0.1)",
subtitle = paste0("Total: ", nrow(sig_genes), " genes")
) +
scale_fill_viridis_c(option = "C") +
theme_minimal()